Website: www.CountryCode.org
Data Composition: Country Name, Country Telephone Code, ISO Code, Population Amount, Area km2, GDP in USD
Running Time: 3.1 sec
Project: Non-commercial use
Author: Pedro Sanhueza
import os # create the output folder before saving
from datetime import datetime # to save file with current time

import pandas as pd # build data frame
import plotly.express as px # display plots
import plotly.io as pio
import requests # get html from URL
from bs4 import BeautifulSoup # find elements in html
# Select Plotly's 'notebook' renderer as the default for every figure below.
pio.renderers.default = 'notebook'
# Download the page. The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of parsing an error page into the table below.
response = requests.get("https://countrycode.org/", timeout=30) # read data from URL
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser') # parse the downloaded HTML
# Flatten the text of every <td> cell into one list and keep only the first
# 240 rows of 6 cells each.
ls = [x.get_text() for x in soup.select('td')][:240*6]
data = { # every 6th cell, starting at offsets 0 to 5, forms one column
    'Country': ls[0::6],
    'Country_code': ls[1::6],
    'ISO_codes': ls[2::6],
    'Population': ls[3::6],
    'Area_KM2': ls[4::6],
    'GDP_USD': ls[5::6],
}
df = pd.DataFrame(data) # build data frame
df # showcase the extraction of the website table
| | Country | Country_code | ISO_codes | Population | Area_KM2 | GDP_USD |
|---|---|---|---|---|---|---|
| 0 | Afghanistan | 93 | AF / AFG | 29,121,286 | 647,500 | 20.65 Billion |
| 1 | Albania | 355 | AL / ALB | 2,986,952 | 28,748 | 12.8 Billion |
| 2 | Algeria | 213 | DZ / DZA | 34,586,184 | 2,381,740 | 215.7 Billion |
| 3 | American Samoa | 1-684 | AS / ASM | 57,881 | 199 | 462.2 Million |
| 4 | Andorra | 376 | AD / AND | 84,000 | 468 | 4.8 Billion |
| ... | ... | ... | ... | ... | ... | ... |
| 235 | Wallis and Futuna | 681 | WF / WLF | 16,025 | 274 | |
| 236 | Western Sahara | 212 | EH / ESH | 273,008 | 266,000 | |
| 237 | Yemen | 967 | YE / YEM | 23,495,361 | 527,970 | 43.89 Billion |
| 238 | Zambia | 260 | ZM / ZMB | 13,460,305 | 752,614 | 22.24 Billion |
| 239 | Zimbabwe | 263 | ZW / ZWE | 11,651,858 | 390,580 | 10.48 Billion |
240 rows × 6 columns
# Convert the 'GDP_USD' column from strings to integers
def gdp_value(x): # from str to int
    """Convert an "<amount> <multiplier>" string into a whole number.

    The argument is turned into a string and split on single spaces; the
    first two pieces are read as numbers and multiplied together, e.g.
    "20.65 1000000000" -> 20650000000.

    Returns:
        The product as an int, or the string "No Value Found" when the
        text cannot be read as two numbers (for example an empty cell).
    """
    try:
        parts = str(x).split(' ') # divide string in two
        amount = float(parts[0])
        multiplier = float(parts[1])
        # round() instead of int(): the float product can land a hair below
        # the true whole number, and truncating would then lose one unit.
        return round(amount * multiplier)
    except (ValueError, IndexError, OverflowError):
        # ValueError: non-numeric or NaN; IndexError: no second piece;
        # OverflowError: infinite product.
        return "No Value Found"
# Magnitude words and the numeric multiplier (as text) that replaces each one.
replacements = {'Billion':'1000000000', 'Million':'1000000', 'Trillion': '1000000000000'}
# Swap the words for digits, then let gdp_value() turn every cell into an
# integer (or the "No Value Found" marker).
gdp_text = df.GDP_USD.replace(replacements, regex=True)
df['GDP_USD'] = list(map(gdp_value, gdp_text))
# Drop the rows without a GDP figure, sort ascending and keep the last ten,
# i.e. the ten largest values.
has_gdp = df['GDP_USD'] != "No Value Found"
by_gdp = df[has_gdp].sort_values(by=['GDP_USD'])
df1 = by_gdp[-10:].copy()
px.bar(df1, x='Country', y='GDP_USD', title="GDP per Country", text_auto=True) # display bar chart
# Strip the thousands separators and store the populations as integers.
population_text = df.Population.replace(',', '', regex=True)
df['Population'] = list(map(int, population_text))
# Sort from largest to smallest and keep the first ten rows.
by_population = df.sort_values(by=['Population'], ascending=False)
df2 = by_population[:10].copy()
px.bar(
    df2,
    x='Country',
    y='Population',
    title="Top 10 Countries Population Count",
    text_auto=True,
    color='Population',
) # display bar chart
# Strip the thousands separators and store the areas as integers.
area_text = df.Area_KM2.replace(',', '', regex=True)
df['Area_KM2'] = list(map(int, area_text))
# One labelled marker per country: area on x, population on y, coloured by area.
fig = px.scatter(
    df,
    x="Area_KM2",
    y="Population",
    color='Area_KM2',
    text='Country',
    title="Area vs Population Amount",
)
fig.update_traces(textposition='top center') # put each country label above its marker
# optional:
# Folder location plus a file name stamped with the current date and time.
file_path = '../Country Code - Historical Data/Country Code ' + datetime.now().strftime("%d-%m-%Y %H%M%S") + ".csv"
# to_csv() does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
df.to_csv(file_path) # save data frame as csv in file location